# Computations
import numpy as np
import pandas as pd
import scipy.stats as stats
# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV, KFold
from sklearn import metrics
# NOTE(review): sklearn.utils.fixes.loguniform was removed in newer scikit-learn
# releases (scipy.stats.loguniform is its long-term home) — confirm the pinned version.
from sklearn.utils.fixes import loguniform
# Tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Timer
from timeit import default_timer as timer
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## missingno
import missingno as msno
## Text — NOTE(review): duplicate of the colorama/IPython imports above; harmless but removable
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## seaborn
import seaborn as sns
sns.set_context('paper', rc={'font.size':12,'axes.titlesize':14,'axes.labelsize':12})
sns.set_style('whitegrid')
## matplotlib
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
from matplotlib.font_manager import FontProperties
# Global matplotlib defaults: 14x8-inch figures with readable label/tick sizes.
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# %config InlineBackend.figure_format = 'retina'
# Deliberately silence all warnings to keep notebook output clean
# (this also hides deprecation notices).
import warnings
warnings.filterwarnings("ignore")
In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for predictions.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
The datasets consist of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
# Load the Pima Indians Diabetes data, preview the first rows, and report its size.
Data = pd.read_csv('pima-indians-diabetes-database/diabetes_mod.csv')
display(Data.head())
size_summary = pd.DataFrame({'Number of Instances': [Data.shape[0]],
                             'Number of Attributes': [Data.shape[1]]})
# NOTE(review): Styler.hide_index() was removed in pandas >= 2.0; this file's
# pandas version evidently predates that — confirm before upgrading.
display(size_summary.style.hide_index())
| | Pregnancies | Glucose | Blood Pressure | Skin Thickness | Insulin | BMI | Diabetes Pedigree Function | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.0 | 148.0 | 72.0 | 35.0 | 0.0 | 33.6 | 0.627 | 50.0 | 1 |
| 1 | 1.0 | 85.0 | 66.0 | 29.0 | 0.0 | 26.6 | 0.351 | 31.0 | 0 |
| 2 | 8.0 | 183.0 | 64.0 | 0.0 | 0.0 | 23.3 | 0.672 | 32.0 | 1 |
| 3 | 1.0 | 89.0 | 66.0 | 23.0 | 94.0 | 28.1 | 0.167 | 21.0 | 0 |
| 4 | 0.0 | 137.0 | 40.0 | 35.0 | 168.0 | 43.1 | 0.254 | 33.0 | 1 |
| Number of Instances | Number of Attributes |
|---|---|
| 768 | 9 |
| Feature | Explanations |
|---|---|
| Pregnancies | Number of times pregnant |
| Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test |
| Blood Pressure | Diastolic blood pressure (mm Hg) |
| Skin Thickness | Triceps skinfold thickness (mm) |
| Insulin | 2-Hour serum insulin (mu U/ml) |
| BMI | Body mass index (weight in kg/(height in m)^2) |
| Diabetes Pedigree Function | Diabetes pedigree function |
| Age | Age (years) |
| Outcome | Whether or not a patient has diabetes |
A multi-layer perceptron (MLP) is a class of feedforward artificial neural network (ANN). scikit-learn.org has a well-written article regarding MLP and interested readers are encouraged to see this article.
Splitting the data into X and y sets:
# Split the table into predictors (X) and the binary target (y),
# and name the two outcome classes for plot tick labels.
Target = 'Outcome'
y = Data[Target]
X = Data.drop(columns=[Target])
Labels = ['Non-Diabetic', 'Diabetic']
# Visualise the raw feature variances as a single-row annotated heatmap,
# sorted from most to least variable.
fig, ax = plt.subplots(figsize=(10, 10))
variances = X.var().sort_values(ascending=False).to_frame(name='Variance').round(2).T
sns.heatmap(variances, ax=ax, annot=True, square=True, cmap=sns.color_palette("OrRd", 20),
            linewidths=0.8, vmin=0, vmax=variances.max(axis=1)[0], annot_kws={"size": 12.5},
            cbar_kws={'label': 'Feature Variance', "aspect": 80, "shrink": .4, "orientation": "horizontal"})
# Wrap multi-word feature names onto several lines, keeping "of" attached to its line.
wrapped = [t.get_text().replace(' ', '\n').replace('\nof\n', ' of\n') for t in ax.get_xticklabels()]
ax.set_xticklabels(wrapped)
ax.set_yticklabels('')
del variances
We can standardize features by removing the mean and scaling to unit variance.
# Standardise every feature to zero mean and unit variance, keeping the
# original column names by rebuilding the DataFrame around the scaled array.
scaler = StandardScaler()
X = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
# Re-plot feature variances after standardisation — every cell should now read ~1.
fig, ax = plt.subplots(figsize=(10, 10))
scaled_var = X.var().sort_values(ascending=False).to_frame(name='Variance').round(2).T
sns.heatmap(scaled_var, ax=ax, annot=True, square=True, cmap=sns.color_palette("Greens", 20),
            linewidths=0.8, vmin=0, vmax=scaled_var.max(axis=1)[0], annot_kws={"size": 12.5},
            cbar_kws={'label': 'Feature Variance', "aspect": 80, "shrink": .4, "orientation": "horizontal"})
# Same multi-line label wrapping as the raw-variance plot.
wrapped = [t.get_text().replace(' ', '\n').replace('\nof\n', ' of\n') for t in ax.get_xticklabels()]
ax.set_xticklabels(wrapped)
ax.set_yticklabels('')
del scaled_var
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility,
# then display the resulting set shapes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
shape_table = pd.DataFrame({'Set': ['X_train', 'X_test', 'y_train', 'y_test'],
                            'Shape': [X_train.shape, X_test.shape, y_train.shape, y_test.shape]})
shape_table.set_index('Set').T
| Set | X_train | X_test | y_train | y_test |
|---|---|---|---|---|
| Shape | (537, 8) | (231, 8) | (537,) | (231,) |
# Binary-classification MLP: two 64-unit ReLU hidden layers, each followed by
# 50% dropout, and a single sigmoid output unit.
model = keras.Sequential(
    [
        layers.Dense(64, input_dim=X.shape[1], activation='relu', name='Layer1'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu', name='Layer2'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid', name='Layer3'),
    ],
    name='Binary_MLP',
)
model.summary()
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, expand_nested=True, rankdir='LR')
Model: "Binary_MLP" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= Layer1 (Dense) (None, 64) 576 _________________________________________________________________ dropout (Dropout) (None, 64) 0 _________________________________________________________________ Layer2 (Dense) (None, 64) 4160 _________________________________________________________________ dropout_1 (Dropout) (None, 64) 0 _________________________________________________________________ Layer3 (Dense) (None, 1) 65 ================================================================= Total params: 4,801 Trainable params: 4,801 Non-trainable params: 0 _________________________________________________________________
# Number of training epochs (500 + 1, so the final epoch index is 500).
IT = 501  # == int(5e2) + 1
model.compile(optimizer='rmsprop', loss='binary_crossentropy',
              metrics=['accuracy', 'mae', 'mse'])
# Train silently, tracking the held-out test split as validation data.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=IT, batch_size=128, verbose=0)
def Search_List(Key, List):
    """Return the items of List that contain the substring Key, preserving order."""
    return [item for item in List if Key in item]
# Map Keras history keys to display names.
Metrics_Names = {'loss': 'Loss', 'accuracy': 'Accuracy', 'mae': 'MAE', 'mse': 'MSE'}

def Table_modify(df, Metrics_Names=Metrics_Names):
    """Return a copy of df with metric columns renamed for display, ordered
    alphabetically, and an Iteration counter prepended as the first column."""
    out = df.rename(columns=Metrics_Names)
    out = out.reindex(columns=sorted(out.columns))
    out.insert(0, 'Iteration', np.arange(out.shape[0]), allow_duplicates=False)
    return out
# Split the Keras history into one DataFrame per data split: keys prefixed
# with 'val_' belong to the validation set, everything else to the train set.
val_keys = Search_List('val_', history.history.keys())
train_keys = list(set(history.history.keys()) - set(val_keys))
# Strip the 'val_' prefix so both tables share the same column names.
Validation_Table = pd.DataFrame({k.replace('val_', ''): history.history[k] for k in val_keys})
Train_Table = pd.DataFrame({k: history.history[k] for k in train_keys})
Train_Table = Table_modify(Train_Table)
Validation_Table = Table_modify(Validation_Table)
# Evaluate the fitted model on both splits.
# BUG FIX: the original cell had the labels swapped — it evaluated X_test under
# 'Train Set Score' and X_train under 'Validation Set Score'. Each split is now
# evaluated under its own label (the test split doubles as validation data in fit()).
# Train Set Score
score = model.evaluate(X_train, y_train, batch_size=128, verbose=0)
score = pd.DataFrame(score, index=model.metrics_names).T
score.index = ['Train Set Score']
# Validation Set Score
Temp = model.evaluate(X_test, y_test, batch_size=128, verbose=0)
Temp = pd.DataFrame(Temp, index=model.metrics_names).T
Temp.index = ['Validation Set Score']
# pd.concat instead of DataFrame.append, which was removed in pandas >= 2.0.
score = pd.concat([score, Temp])
score.rename(columns=Metrics_Names, inplace=True)
score = score.reindex(sorted(score.columns), axis=1)
display(score.style.set_precision(4))
| | Accuracy | Loss | MAE | MSE |
|---|---|---|---|---|
| Train Set Score | 0.8678 | 0.3049 | 0.2104 | 0.0958 |
| Validation Set Score | 0.7446 | 0.6318 | 0.3094 | 0.1858 |
def Plot_history(history, Title=False, Table_Rows=25):
    """Plot training-history metrics as curves next to a sampled value table.

    history : DataFrame with columns Iteration, Loss, Accuracy, MAE, MSE
              (one row per epoch, as produced by Table_modify).
    Title   : optional figure title (False disables it).
    Table_Rows : number of evenly spaced epochs shown in the table.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "scatter"}, {"type": "table"}]])
    # Left panel: one curve per metric. The insertion order matters because
    # the legend uses traceorder='reversed'.
    curves = [('Loss', 'OrangeRed', 'Loss'),
              ('Accuracy', 'MidnightBlue', 'Accuracy'),
              ('MAE', 'ForestGreen', 'Mean Absolute Error (MAE)'),
              ('MSE', 'purple', 'Mean Squared Error (MSE)')]
    for column, colour, label in curves:
        fig.add_trace(go.Scatter(x=history['Iteration'].values, y=history[column].values,
                                 line=dict(color=colour, width=1.5), name=label), 1, 1)
    fig.update_layout(legend=dict(x=0, y=1.1, traceorder='reversed', font_size=12),
                      dragmode='select', plot_bgcolor='white', height=600,
                      hovermode='closest', legend_orientation='h')
    # Shared grid/border styling for both axes of the scatter panel.
    axis_style = dict(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                      showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.update_xaxes(range=[history.Iteration.min(), history.Iteration.max()],
                     row=1, col=1, **axis_style)
    fig.update_yaxes(range=[0, 1], row=1, col=1, **axis_style)
    # Right panel: keep Table_Rows evenly spaced epochs plus the final one.
    keep = np.linspace(0, history.shape[0], Table_Rows, endpoint=False).round(0).astype(int)
    keep = np.append(keep, history.Iteration.values[-1])
    sampled = history[history.index.isin(keep)]
    cell_values = [sampled.loc[:, c].astype(float).round(4).values for c in sampled.columns]
    fig.add_trace(go.Table(header=dict(values=list(sampled.columns), line_color='darkslategray',
                                       fill_color='DimGray', align=['center', 'center'],
                                       font=dict(color='white', size=12), height=25),
                           columnwidth=[0.4, 0.4, 0.4, 0.4],
                           cells=dict(values=cell_values, line_color='darkslategray',
                                      fill=dict(color=['WhiteSmoke', 'white']),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 2)
    # Kept as `!= False` (not truthiness) so that an empty-string title still renders.
    if Title != False:
        fig.update_layout(plot_bgcolor='white',
                          title={'text': Title, 'x': 0.46, 'y': 0.94,
                                 'xanchor': 'center', 'yanchor': 'top'},
                          yaxis_title='Frequency')
    fig.show()
def Confusion_Matrix(Model, X, y, Labels, FG=(12, 4)):
    """Draw the raw and row-normalised confusion matrices side by side.

    Model : fitted classifier exposing .predict(X).
    X, y  : features and true labels (y may be one-hot or a 1-D binary vector).
    Labels: tick labels for the two classes.
    FG    : figure size. Returns (fig, axes).
    """
    fig, axes = plt.subplots(1, 2, figsize=FG)
    predictions = Model.predict(X)
    if len(y.shape) > 1:
        # One-hot labels: compare argmax class indices.
        cm = metrics.confusion_matrix(y.argmax(axis=1), predictions.argmax(axis=1))
    else:
        # 1-D labels with sigmoid scores: threshold at 0.5 via rounding.
        cm = metrics.confusion_matrix(y, np.round(predictions))
    sns.heatmap(cm.round(2), annot=True, annot_kws={"size": 14}, cmap="Blues", ax=axes[0])
    axes[0].set_title('Confusion Matrix')
    # Normalise each row by its true-class count.
    normalised = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    sns.heatmap(normalised.round(2), annot=True, annot_kws={"size": 14}, cmap="Greens",
                ax=axes[1], linewidths=0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
    axes[1].set_title('Normalized Confusion Matrix')
    for axis in axes:
        axis.set_xlabel('Predicted labels')
        axis.set_ylabel('True labels')
        axis.xaxis.set_ticklabels(Labels)
        axis.yaxis.set_ticklabels(Labels)
        axis.set(aspect='equal')
    return fig, axes
def ROC_Plot(y_test, Proba, pad=1e-2, FS=6):
    """Plot the ROC curve (with AUC in the legend) for scores Proba vs truth y_test.

    pad pads the axis limits slightly past [0, 1]; FS is the square figure size.
    """
    # False positive rates, true positive rates and decision thresholds.
    fpr, tpr, threshold = metrics.roc_curve(y_test, Proba)
    area = metrics.auc(fpr, tpr)
    fig, ax = plt.subplots(1, 1, figsize=(FS, FS))
    ax.plot(fpr, tpr, lw=2, label='AUC = %0.2f' % area)
    ax.plot([0, 1], [0, 1], 'r--', lw=2)  # chance diagonal
    ax.legend(loc='lower right', fontsize=14)
    ax.set_xlim([-pad, 1 + pad])
    ax.set_ylim([-pad, 1 + pad])
    ax.set_xlabel('False Positive Rate (FPR)')
    ax.set_ylabel('True Positive Rate (TPR)')
    ax.set_title('Receiver Operating Characteristic (ROC)', fontsize=16)
    ax.set(aspect='equal')
# Bold font for the figure super-titles.
font = FontProperties()
font.set_weight('bold')
# Learning curves for both splits.
Plot_history(Train_Table, Title='Train Set')
Plot_history(Validation_Table, Title='Validation Set')
# Confusion matrices: train split, then test split.
fig, _ = Confusion_Matrix(model, X_train, y_train, Labels)
fig.suptitle('Train Set', fontproperties=font, fontsize=16)
fig, _ = Confusion_Matrix(model, X_test, y_test, Labels)
_ = fig.suptitle('Test Set', fontproperties=font, fontsize=16)
# ROC curve on the held-out test set (sigmoid outputs as scores).
ROC_Plot(y_test, model.predict(X_test), pad=1e-2, FS=6)